Processamento dos dados
# Locate every CSV whose name contains "thais_" below the working directory
# (sub-directories included, case-insensitive match).
data_files_selected <-
  list.files(
    pattern = ".*thais_.*csv$",
    recursive = TRUE,
    ignore.case = TRUE
  )

# Named numeric vector: file path -> size in bytes.
# file.size() is already vectorised, so the result stays a numeric vector even
# when no file matches; sapply() would return an empty list in that case and
# the division below would fail.
data_files_and_size <-
  setNames(file.size(data_files_selected), data_files_selected)

# One row per input file, with the size converted from bytes to megabytes.
files_to_include_in_dataframe <-
  tibble(
    "Files" = names(data_files_and_size),
    "Size (in MB)" = data_files_and_size / 1E6
  )

# Print a summary of the file inventory.
skimr::skim(files_to_include_in_dataframe)
Data summary
| Name |
files_to_include_in_dataf… |
| Number of rows |
24 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: numeric
| Size (in MB) |
0 |
1 |
2.18 |
3.48 |
0.32 |
0.38 |
0.6 |
2.1 |
10.99 |
▇▁▁▁▁ |
# Build `cdr` from every selected file.
# NOTE(review): load_cdr() is defined outside this section; the code below
# relies on its result having a `file` column and a `cdr3` column.
cdr <- load_cdr(names(data_files_and_size))

# Derive three labels from the `file` column by pattern matching.
# Anything that matches none of the patterns is labelled "unknown".
cdr <- cdr %>%
  # Experiment group.
  mutate(
    expgroup = case_when(
      str_detect(file, "thaisnovoheader") ~ "nh",
      str_detect(file, "thais_29") ~ "29",
      str_detect(file, "thais_66") ~ "66",
      TRUE ~ "unknown"
    )
  ) %>%
  # Pair of rounds being compared; the first matching pattern wins.
  mutate(
    cycle = case_when(
      str_detect(file, "R0_R2") ~ "R0_R2",
      str_detect(file, "R0_R3") ~ "R0_R3",
      str_detect(file, "R0_R4") ~ "R0_R4",
      str_detect(file, "R2_R3") ~ "R2_R3",
      str_detect(file, "R2_R4") ~ "R2_R4",
      str_detect(file, "R3_R4") ~ "R3_R4",
      TRUE ~ "unknown"
    )
  ) %>%
  # Time point within the comparison.
  mutate(
    time = case_when(
      str_detect(file, "Initial") ~ "initial",
      str_detect(file, "Final") ~ "final",
      TRUE ~ "unknown"
    )
  ) %>%
  # Put the identifying columns first, leaving the rest in their order.
  select(cdr3, cycle, time, expgroup, everything())
# Fold change of `cdrp` (fcp) and `quantity` (fcq) relative to the previous row
# of the same (cdr3, expgroup, cycle) group.
# Rows are sorted by descending `time` inside each group, so "initial" comes
# before "final" ("unknown", if present, sorts first). The first row of a
# group is divided by itself and therefore gets a fold change of 1.
cdr <- cdr %>%
  arrange(cdr3, expgroup, cycle, desc(time)) %>%
  group_by(cdr3, expgroup, cycle) %>%
  mutate(
    fcp = cdrp / lag(cdrp, default = first(cdrp)),
    fcq = quantity / lag(quantity, default = first(quantity))
  ) %>%
  # Place the two new columns right after the cdr3..quantity run of columns.
  select(cdr3:quantity, fcp, fcq, everything())
# Distribution of log10(fcp) for the top 10% of "final" rows (ranked by fcp)
# in each (expgroup, cycle, time) group, one facet column per cycle.
cdr %>%
  filter(time == "final") %>%
  # Alternative kept from the original: filter(str_detect(cycle, "R0"))
  arrange(desc(fcp)) %>%
  group_by(expgroup, cycle, time) %>%
  # Rows are already in descending fcp order, so the head of each group is
  # its top 10%. Alternative kept from the original: slice_head(n = 1000)
  slice_head(prop = .1) %>%
  ggplot(aes(x = expgroup, y = log10(fcp))) +
  geom_violin(aes(fill = expgroup, color = expgroup), alpha = 0.5) +
  geom_jitter(aes(shape = expgroup), alpha = 0.6, size = 1) +
  # Crossbar collapsed onto the mean: min, middle and max are all mean().
  stat_summary(
    aes(color = expgroup),
    geom = "crossbar",
    # width = 0.5,
    fun = mean,
    fun.min = mean,
    fun.max = mean
  ) +
  facet_grid(cols = vars(cycle))

# cdr %>%
# filter(str_detect(cycle, "R0")) %>%
# filter(time == "final") %>%
# group_by(expgroup, cycle, time) %>%
# arrange(desc(fcp)) %>%
# slice_head(n = 1000) %>%
# ggplot(aes(fcp, color = expgroup, fill = expgroup)) +
# geom_density(stat = "bin", alpha = 0.3) +
# facet_grid(cycle ~ expgroup)
# Classify every row as "rich", "medium" or "poor".
#   1. Keep the top 10% of rows by fcp in each (expgroup, cycle, time) group.
#   2. threshold = mean log10(fcp) of that top slice.
#   3. Within the slice: "rich" when log10(fcp) >= threshold and time is
#      "final", otherwise "medium".
#   4. Join back onto the full table (keys are all shared columns); rows that
#      were not in the slice have no label and become "poor", threshold 0.
# The right-hand side is evaluated completely before `cdr` is overwritten, so
# full_join(cdr) still sees the table as it was before this statement.
cdr <- cdr %>%
  arrange(desc(fcp)) %>%
  group_by(expgroup, cycle, time) %>%
  slice_head(prop = .1) %>%
  mutate(
    threshold = mean(log10(fcp)),
    rich = if_else(
      # Alternative kept from the original:
      # (time == "final") & (str_detect(cycle, "R0"))
      (log10(fcp) >= threshold) & (time == "final"),
      "rich",
      "medium"
    )
  ) %>%
  full_join(cdr) %>%
  mutate(
    rich = if_else(is.na(rich), "poor", rich),
    rich = factor(rich, levels = c("rich", "medium", "poor")),
    threshold = if_else(is.na(threshold), 0, threshold)
  )
# log10(fcp) per experiment group for the rows classified as "rich";
# facet rows = class, facet columns = cycle.
cdr %>%
  filter(rich == "rich") %>%
  ggplot(aes(x = expgroup, y = log10(fcp))) +
  geom_violin(aes(fill = expgroup)) +
  geom_jitter(aes(shape = expgroup), alpha = .2) +
  facet_grid(rows = vars(rich), cols = vars(cycle))

# Same plot for every row that is not "rich" (i.e. "medium" and "poor").
cdr %>%
  filter(rich != "rich") %>%
  ggplot(aes(x = expgroup, y = log10(fcp))) +
  geom_violin(aes(fill = expgroup)) +
  geom_jitter(aes(shape = expgroup), alpha = .2) +
  facet_grid(rows = vars(rich), cols = vars(cycle))

Tentativa de Clusterização
Clusterização Experimento 29
library(Rtsne)
set.seed(42)

# Rows used for the experiment-29 embedding: "final" time point of the
# R0_R4 comparison.
# Alternative kept from the original: filter(!rich == "poor")
tsne_df <- cdr %>%
  filter(time == "final") %>%
  filter(cycle == "R0_R4") %>%
  filter(expgroup == "29")

# Sanity check: list the columns that contain at least one NA.
# anyNA() returns a logical directly, avoiding the integer-to-logical coercion
# warning raised by all(sum(is.na(x))). Grouping columns are always kept in
# the output, so a result showing only those means no column has missing data.
tsne_df %>%
  summarise(across(everything(), anyNA)) %>%
  select(where(isTRUE))
## # A tibble: 1 x 2
## # Groups: expgroup, cycle [1]
## expgroup cycle
## <chr> <chr>
## 1 29 R0_R4
# Show any row whose threshold is still missing (grouping columns are carried
# along automatically). An empty result means every row has a threshold.
tsne_df %>%
  select(rich, threshold, quantity) %>%
  filter(is.na(threshold))
## # A tibble: 0 x 6
## # Groups: expgroup, cycle, time [0]
## # … with 6 variables: expgroup <chr>, cycle <chr>, time <chr>, rich <fct>,
## # threshold <dbl>, quantity <int>
# 3-D t-SNE embedding of the experiment-29 subset.
set.seed(42)
tsne_out <-
  tsne_df %>%
  ungroup() %>%
  # Drop character columns.
  # NOTE(review): `rich` is a factor, not character, so it passes this filter
  # and appears to be part of the t-SNE input while also being used to colour
  # the embedding afterwards -- confirm this is intended.
  select(!where(is.character)) %>%
  # Drop columns whose variance is exactly zero.
  select(!c(which(apply(., 2, var) == 0))) %>%
  # Keep distinct rows only.
  unique() %>%
  Rtsne(
    X = .,
    dims = 3,
    perplexity = 420,
    theta = 0.1,
    max_iter = 2E3,
    verbose = TRUE,
    pca_center = TRUE,
    pca_scale = TRUE,
    normalize = TRUE,
    eta = 200.0,
    exaggeration_factor = 12.0,
    # Leave two cores free, but never request fewer than one thread:
    # detectCores() may return 1, 2 or NA, which would otherwise yield an
    # invalid (negative / NA) or unintended (0) thread count.
    num_threads = max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
  )
## Performing PCA
## Read the 1544 x 42 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 420.000000, and theta = 0.100000
## Computing input similarities...
## Building tree...
## Done in 10.13 seconds (sparsity = 0.932734)!
## Learning embedding...
## Iteration 50: error is 44.145871 (50 iterations in 15.87 seconds)
## Iteration 100: error is 44.145871 (50 iterations in 18.00 seconds)
## Iteration 150: error is 44.145871 (50 iterations in 20.99 seconds)
## Iteration 200: error is 44.145871 (50 iterations in 22.81 seconds)
## Iteration 250: error is 44.145871 (50 iterations in 25.42 seconds)
## Iteration 300: error is 1.193916 (50 iterations in 21.90 seconds)
## Iteration 350: error is 0.319184 (50 iterations in 11.58 seconds)
## Iteration 400: error is 0.241505 (50 iterations in 11.28 seconds)
## Iteration 450: error is 0.229975 (50 iterations in 10.10 seconds)
## Iteration 500: error is 0.226926 (50 iterations in 10.35 seconds)
## Iteration 550: error is 0.226181 (50 iterations in 10.55 seconds)
## Iteration 600: error is 0.225912 (50 iterations in 10.54 seconds)
## Iteration 650: error is 0.225838 (50 iterations in 10.27 seconds)
## Iteration 700: error is 0.225819 (50 iterations in 10.23 seconds)
## Iteration 750: error is 0.225798 (50 iterations in 10.05 seconds)
## Iteration 800: error is 0.225781 (50 iterations in 10.04 seconds)
## Iteration 850: error is 0.225778 (50 iterations in 10.17 seconds)
## Iteration 900: error is 0.225771 (50 iterations in 10.58 seconds)
## Iteration 950: error is 0.225767 (50 iterations in 10.08 seconds)
## Iteration 1000: error is 0.225760 (50 iterations in 9.78 seconds)
## Iteration 1050: error is 0.225743 (50 iterations in 10.27 seconds)
## Iteration 1100: error is 0.225722 (50 iterations in 10.02 seconds)
## Iteration 1150: error is 0.225723 (50 iterations in 10.27 seconds)
## Iteration 1200: error is 0.225719 (50 iterations in 9.85 seconds)
## Iteration 1250: error is 0.225719 (50 iterations in 9.81 seconds)
## Iteration 1300: error is 0.225725 (50 iterations in 10.28 seconds)
## Iteration 1350: error is 0.225728 (50 iterations in 10.11 seconds)
## Iteration 1400: error is 0.225729 (50 iterations in 10.30 seconds)
## Iteration 1450: error is 0.225728 (50 iterations in 10.00 seconds)
## Iteration 1500: error is 0.225730 (50 iterations in 10.32 seconds)
## Iteration 1550: error is 0.225729 (50 iterations in 10.34 seconds)
## Iteration 1600: error is 0.225730 (50 iterations in 9.85 seconds)
## Iteration 1650: error is 0.225734 (50 iterations in 10.03 seconds)
## Iteration 1700: error is 0.225733 (50 iterations in 10.53 seconds)
## Iteration 1750: error is 0.225732 (50 iterations in 9.77 seconds)
## Iteration 1800: error is 0.225733 (50 iterations in 10.20 seconds)
## Iteration 1850: error is 0.225730 (50 iterations in 9.96 seconds)
## Iteration 1900: error is 0.225729 (50 iterations in 10.14 seconds)
## Iteration 1950: error is 0.225730 (50 iterations in 10.35 seconds)
## Iteration 2000: error is 0.225735 (50 iterations in 9.97 seconds)
## Fitting performed in 472.95 seconds.
# Rebuild the table that was fed to Rtsne() so that its `rich` column can be
# used to colour the embedding.
# Assumes `tsne_out` was produced from `tsne_df` with exactly these
# preprocessing steps, so rows of `a` line up with rows of `tsne_out$Y`.
a <- tsne_df %>%
  ungroup() %>%
  select(!where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  unique()

# 2-D view of the first two embedding dimensions.
# as.data.frame() names the unnamed matrix columns V1, V2, V3 explicitly,
# instead of relying on as_tibble()'s deprecated handling of unnamed matrices.
# `rich` is attached as a real column so aes() does not need `a$rich`.
tsne_out$Y %>%
  as.data.frame() %>%
  as_tibble() %>%
  mutate(rich = a$rich) %>%
  ggplot() +
  geom_point(aes(V1, V2, color = rich))

# Interactive 3-D view of the embedding, rendered once.
# The plot title is set through layout(); `title` is not passed to plot_ly(),
# where it would be treated as a trace attribute.
tsne_out$Y %>%
  as.data.frame() %>%
  as_tibble() %>%
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 29")
Clusterização Experimento 66
library(Rtsne)
set.seed(42)

# Rows used for the experiment-66 embedding: "final" time point of the
# R0_R4 comparison.
# Alternative kept from the original: filter(!rich == "poor")
tsne_df <- cdr %>%
  filter(time == "final") %>%
  filter(cycle == "R0_R4") %>%
  filter(expgroup == "66")

# Sanity check: list the columns that contain at least one NA.
# anyNA() returns a logical directly, avoiding the integer-to-logical coercion
# warning raised by all(sum(is.na(x))). Grouping columns are always kept in
# the output, so a result showing only those means no column has missing data.
tsne_df %>%
  summarise(across(everything(), anyNA)) %>%
  select(where(isTRUE))
## # A tibble: 1 x 2
## # Groups: expgroup, cycle [1]
## expgroup cycle
## <chr> <chr>
## 1 66 R0_R4
# Show any row whose threshold is still missing (grouping columns are carried
# along automatically). An empty result means every row has a threshold.
tsne_df %>%
  select(rich, threshold, quantity) %>%
  filter(is.na(threshold))
## # A tibble: 0 x 6
## # Groups: expgroup, cycle, time [0]
## # … with 6 variables: expgroup <chr>, cycle <chr>, time <chr>, rich <fct>,
## # threshold <dbl>, quantity <int>
# 3-D t-SNE embedding of the experiment-66 subset.
set.seed(42)
tsne_out <-
  tsne_df %>%
  ungroup() %>%
  # Drop character columns.
  # NOTE(review): `rich` is a factor, not character, so it passes this filter
  # and appears to be part of the t-SNE input while also being used to colour
  # the embedding afterwards -- confirm this is intended.
  select(!where(is.character)) %>%
  # Drop columns whose variance is exactly zero.
  select(!c(which(apply(., 2, var) == 0))) %>%
  # Keep distinct rows only.
  unique() %>%
  Rtsne(
    X = .,
    dims = 3,
    perplexity = 30,
    theta = 0.5,
    max_iter = 1E3,
    verbose = TRUE,
    pca_center = TRUE,
    pca_scale = TRUE,
    normalize = TRUE,
    eta = 200.0,
    exaggeration_factor = 12.0,
    # Leave two cores free, but never request fewer than one thread:
    # detectCores() may return 1, 2 or NA, which would otherwise yield an
    # invalid (negative / NA) or unintended (0) thread count.
    num_threads = max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
  )
## Performing PCA
## Read the 8436 x 42 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 14.15 seconds (sparsity = 0.015493)!
## Learning embedding...
## Iteration 50: error is 95.444063 (50 iterations in 25.25 seconds)
## Iteration 100: error is 92.761432 (50 iterations in 37.79 seconds)
## Iteration 150: error is 91.017719 (50 iterations in 19.85 seconds)
## Iteration 200: error is 90.890341 (50 iterations in 16.90 seconds)
## Iteration 250: error is 90.831025 (50 iterations in 17.12 seconds)
## Iteration 300: error is 2.973973 (50 iterations in 13.09 seconds)
## Iteration 350: error is 2.540478 (50 iterations in 12.58 seconds)
## Iteration 400: error is 2.310407 (50 iterations in 13.03 seconds)
## Iteration 450: error is 2.166549 (50 iterations in 13.19 seconds)
## Iteration 500: error is 2.067423 (50 iterations in 13.28 seconds)
## Iteration 550: error is 1.995633 (50 iterations in 13.70 seconds)
## Iteration 600: error is 1.942256 (50 iterations in 13.50 seconds)
## Iteration 650: error is 1.902282 (50 iterations in 13.35 seconds)
## Iteration 700: error is 1.871328 (50 iterations in 12.91 seconds)
## Iteration 750: error is 1.847140 (50 iterations in 13.84 seconds)
## Iteration 800: error is 1.828978 (50 iterations in 13.87 seconds)
## Iteration 850: error is 1.815785 (50 iterations in 14.25 seconds)
## Iteration 900: error is 1.805099 (50 iterations in 13.57 seconds)
## Iteration 950: error is 1.796396 (50 iterations in 14.03 seconds)
## Iteration 1000: error is 1.788672 (50 iterations in 14.26 seconds)
## Fitting performed in 319.36 seconds.
# Rebuild the table that was fed to Rtsne() so that its `rich` column can be
# used to colour the embedding.
# Assumes `tsne_out` was produced from `tsne_df` with exactly these
# preprocessing steps, so rows of `a` line up with rows of `tsne_out$Y`.
a <- tsne_df %>%
  ungroup() %>%
  select(!where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  unique()

# 2-D view of the first two embedding dimensions.
# as.data.frame() names the unnamed matrix columns V1, V2, V3 explicitly,
# instead of relying on as_tibble()'s deprecated handling of unnamed matrices.
# `rich` is attached as a real column so aes() does not need `a$rich`.
tsne_out$Y %>%
  as.data.frame() %>%
  as_tibble() %>%
  mutate(rich = a$rich) %>%
  ggplot() +
  geom_point(aes(V1, V2, color = rich))

# Interactive 3-D view of the embedding; the title is set through layout().
tsne_out$Y %>%
  as.data.frame() %>%
  as_tibble() %>%
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 66")